!pip install umap-learn
Requirement already satisfied: umap-learn in c:\users\felip\anaconda3\lib\site-packages (0.5.2) Requirement already satisfied: tqdm in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (4.62.3) Requirement already satisfied: numba>=0.49 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (0.51.2) Requirement already satisfied: scikit-learn>=0.22 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.0.2) Requirement already satisfied: pynndescent>=0.5 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (0.5.6) Requirement already satisfied: scipy>=1.0 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.7.3) Requirement already satisfied: numpy>=1.17 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.21.5) Requirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in c:\users\felip\anaconda3\lib\site-packages (from numba>=0.49->umap-learn) (0.34.0) Requirement already satisfied: setuptools in c:\users\felip\anaconda3\lib\site-packages (from numba>=0.49->umap-learn) (58.0.4) Requirement already satisfied: joblib>=0.11 in c:\users\felip\anaconda3\lib\site-packages (from pynndescent>=0.5->umap-learn) (1.1.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\felip\anaconda3\lib\site-packages (from scikit-learn>=0.22->umap-learn) (2.2.0) Requirement already satisfied: colorama in c:\users\felip\anaconda3\lib\site-packages (from tqdm->umap-learn) (0.4.4)
WARNING: Error parsing requirements for torch: [Errno 2] No such file or directory: 'c:\\users\\felip\\anaconda3\\lib\\site-packages\\torch-1.11.0.dist-info\\METADATA'
import re
import pandas as pd
from collections import defaultdict
import string
import multiprocessing
import os
import gensim
import sklearn
from sklearn import linear_model
from collections import Counter
import numpy as np
import scipy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, cohen_kappa_score, classification_report
from nltk.tokenize import word_tokenize
import pickle
import umap
# word2vec
from gensim.models import Word2Vec, KeyedVectors, FastText
from gensim.models.phrases import Phrases, Phraser
from sklearn.model_selection import train_test_split
import logging
import nltk
from nltk.stem import PorterStemmer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Configure INFO-level logging so gensim/sklearn progress messages are visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the pickled training DataFrames. Context managers replace the original
# pickle.load(open(...)) pattern, which never explicitly closed the handles.
# NOTE(review): pickle can execute arbitrary code on load — only open trusted
# project artifacts.
path = "../../Data/train/df_us_train.pickle"
with open(path, "rb") as f:
    df_us_train = pickle.load(f)
path = "../../Data/train/df_es_train.pickle"
with open(path, "rb") as f:
    df_es_train = pickle.load(f)

# Number of distinct gold labels per corpus; reused below as the cluster count.
n_labels_us = df_us_train["label"].unique().shape[0]
n_labels_es = df_es_train["label"].unique().shape[0]
from collections import Counter
# Extended punctuation set: ASCII punctuation plus common typographic marks.
punctuation = string.punctuation + "«»“”‘’…—"
# Stop-word lists, one word per line. header=None is required: the previous
# call relied on the default header='infer', which treated the FIRST stop word
# of each file as a column name and silently dropped it from the list.
stopwords_spanish = pd.read_csv(
    'https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt',
    header=None,
).values
# Counter gives O(1) membership tests in the tokenizer below.
stopwords_spanish = Counter(stopwords_spanish.flatten().tolist())
stopwords_english = pd.read_csv(
    'https://raw.githubusercontent.com/Alir3z4/stop-words/master/english.txt',
    header=None,
).values
stopwords_english = Counter(stopwords_english.flatten().tolist())
from sklearn.pipeline import Pipeline
class StemmerTokenizer:
    """Callable tokenizer for CountVectorizer.

    Pipeline per document: NLTK word tokenization -> stop-word removal ->
    Porter stemming. The stop-word container only needs `in` membership.
    """

    def __init__(self, stopwords):
        self.ps = PorterStemmer()
        self.sw = stopwords

    def __call__(self, doc):
        stem = self.ps.stem
        return [stem(tok) for tok in word_tokenize(doc) if tok not in self.sw]
# Initialize one stemming tokenizer per language.
tokenizador_english = StemmerTokenizer(stopwords_english)
tokenizador_spanish = StemmerTokenizer(stopwords_spanish)

# Unigram bag-of-words vectorizers built on the custom tokenizers.
bow_english = CountVectorizer(tokenizer=tokenizador_english, ngram_range=(1, 1))
bow_spanish = CountVectorizer(tokenizer=tokenizador_spanish, ngram_range=(1, 1))

# Restrict vectorization to the "text" column of the input DataFrame.
ct_bow_english = ColumnTransformer([("BOW", bow_english, "text")])
ct_bow_spanish = ColumnTransformer([("BOW", bow_spanish, "text")])

# Single-step preprocessing pipelines (wrapped in Pipeline for extensibility).
pipe_bow_english = Pipeline(steps=[("preprocessing", ct_bow_english)])
pipe_bow_spanish = Pipeline(steps=[("preprocessing", ct_bow_spanish)])
%%time
# Fit the Spanish BOW pipeline on the training set and transform it into a
# sparse document-term matrix; keep ids and labels row-aligned with the matrix.
es_train_bow_sentence_embedding = pipe_bow_spanish.fit_transform(df_es_train)
es_train_bow_sentence_embedding_id = df_es_train["id"]
es_train_bow_sentence_embedding_label = df_es_train["label"].values
Wall time: 16.6 s
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import time
from sklearn.metrics.cluster import v_measure_score
from sklearn.metrics.cluster import rand_score, homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score
import plotly.express as px
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from sklearn.mixture import GaussianMixture
from analisis_clustering import clustering_report, distance_matrix, plot_distance_matrix, projection, get_projection_df, plot_confussion_matrix
%%time
# Reproducible 20k-document sample for the clustering experiments.
# NOTE(review): RandomState.choice samples WITH replacement by default, so the
# sample may contain duplicate rows — confirm whether replace=False was
# intended (changing it would alter every downstream result/pickle).
sample_index_es = np.random.RandomState(0).choice(range(es_train_bow_sentence_embedding.shape[0]), 20000)
X_es = es_train_bow_sentence_embedding[sample_index_es]
labels_es = es_train_bow_sentence_embedding_label[sample_index_es]
Wall time: 13 ms
from sklearn.decomposition import TruncatedSVD

# LSA-style reduction: project the sparse BOW matrix down to 768 dense
# dimensions so distance-based clustering becomes tractable.
svd = TruncatedSVD(n_components=768, n_iter=7, random_state=42)
X_trans = svd.fit_transform(X_es)
%%time
# K-Means with as many clusters as gold labels; fixed seed for reproducibility.
kmeans_params = {"n_clusters": n_labels_es, "random_state": 0}
kmeans_es_report = clustering_report(X_trans, labels_es, KMeans, kmeans_params)
Fit... Fit (done) Predict... Predict (done) Silhouette... Silhouette (done) Metricas... Metricas (done) Wall time: 9.22 s
# Persist the K-Means report; the context manager closes the handle explicitly
# (the original pickle.dump(obj, open(...)) never did).
with open("bow_kmeans_es_report.pickle", "wb") as f:
    pickle.dump(kmeans_es_report, f)
plot_confussion_matrix(labels_es, kmeans_es_report["clusters"])
%%time
kmeans_es_dm = distance_matrix(X_es, kmeans_es_report["clusters"], size=0.1)
Indices... Indices (done) Matriz de distancias... Matriz de distancias (done) Wall time: 53 ms
# Persist the distance matrix with an explicitly-closed handle, then plot it.
with open("bow_kmeans_es_dm.pickle", "wb") as f:
    pickle.dump(kmeans_es_dm, f)
plot_distance_matrix(kmeans_es_dm, q1=0.05, q3=0.69, x0=-70, scale=1.02, num_tags=5)
%%time
umap_es_projection = projection(X_trans, umap.UMAP, {"random_state":0, "n_neighbors":7, "min_dist":0})
Reducer... Reducer (done) Wall time: 17.3 s
# Persist the UMAP projection (context manager closes the file, unlike the
# original open() without close), then plot a 1000-point sample colored by
# predicted cluster and by gold label.
with open("bow_umap_es_projection.pickle", "wb") as f:
    pickle.dump(umap_es_projection, f)
df_umap_es = get_projection_df(umap_es_projection, kmeans_es_report["clusters"], labels_es)
fig = px.scatter(df_umap_es.sample(1000, random_state=0), x=0, y=1, color="cluster")
fig.show(renderer="notebook")
fig = px.scatter(df_umap_es.sample(1000, random_state=0), x=0, y=1, color="label")
fig.show(renderer="notebook")
%%time
# 2-D t-SNE embedding of the SVD features.
tsne_params = {"n_components": 2, "learning_rate": 'auto', "init": 'random'}
tsne_es_projection = projection(X_trans, TSNE, tsne_params)
Reducer... Reducer (done) Wall time: 1min 20s
# Persist the t-SNE projection with an explicitly-closed handle, then plot a
# 1000-point sample colored by predicted cluster and by gold label.
with open("bow_tsne_es_projection.pickle", "wb") as f:
    pickle.dump(tsne_es_projection, f)
df_tsne_es = get_projection_df(tsne_es_projection, kmeans_es_report["clusters"], labels_es)
fig = px.scatter(df_tsne_es.sample(1000, random_state=0), x=0, y=1, color="cluster")
fig.show(renderer="notebook")
fig = px.scatter(df_tsne_es.sample(1000, random_state=0), x=0, y=1, color="label")
fig.show(renderer="notebook")
# Linear 2-D baseline projection via PCA.
pca_es_projection = projection(X_trans, PCA, {"n_components": 2})
Reducer... Reducer (done)
# Persist the PCA projection with an explicitly-closed handle, then plot a
# 1000-point sample colored by predicted cluster and by gold label.
with open("bow_pca_es_projection.pickle", "wb") as f:
    pickle.dump(pca_es_projection, f)
df_pca_es = get_projection_df(pca_es_projection, kmeans_es_report["clusters"], labels_es)
fig = px.scatter(df_pca_es.sample(1000, random_state=0), x=0, y=1, color="cluster")
fig.show(renderer="notebook")
fig = px.scatter(df_pca_es.sample(1000, random_state=0), x=0, y=1, color="label")
fig.show(renderer="notebook")
%%time
# Hierarchical agglomerative clustering with one cluster per gold label.
hc_params = {"n_clusters": n_labels_es}
hc_es_report = clustering_report(X_trans, labels_es, AgglomerativeClustering, hc_params)
Fit... Fit (done) Predict... Predict (done) Silhouette... Silhouette (done) Metricas... Metricas (done) Wall time: 1min 42s
# Persist the hierarchical-clustering report; context manager closes the
# handle explicitly (the original open() never did).
with open("bow_hc_es_report.pickle", "wb") as f:
    pickle.dump(hc_es_report, f)
plot_confussion_matrix(labels_es, hc_es_report["clusters"])
%%time
hc_es_dm = distance_matrix(X_es, hc_es_report["clusters"], size=0.1)
Indices... Indices (done) Matriz de distancias... Matriz de distancias (done) Wall time: 55 ms
# Persist the distance matrix with an explicitly-closed handle, then plot it.
with open("bow_hc_es_dm.pickle", "wb") as f:
    pickle.dump(hc_es_dm, f)
plot_distance_matrix(hc_es_dm, q1=0.05, q3=0.69, x0=-70, scale=1.02, num_tags=5)
# Re-color the UMAP / t-SNE / PCA projections with the hierarchical clusters.
df_umap_es = get_projection_df(umap_es_projection, hc_es_report["clusters"], labels_es)
df_tsne_es = get_projection_df(tsne_es_projection, hc_es_report["clusters"], labels_es)
df_pca_es = get_projection_df(pca_es_projection, hc_es_report["clusters"], labels_es)
# Plot order matches the original: per projection, cluster coloring then label.
for df_proj in (df_umap_es, df_tsne_es, df_pca_es):
    sample = df_proj.sample(1000, random_state=0)
    for color_col in ("cluster", "label"):
        fig = px.scatter(sample, x=0, y=1, color=color_col)
        fig.show(renderer="notebook")
%%time
# Density-based clustering; the number of clusters is not fixed in advance.
dbscan_params = {"eps": 0.4, "min_samples": 5}
dbscan_es_report = clustering_report(X_trans, labels_es, DBSCAN, dbscan_params)
Fit... Fit (done) Predict... Predict (done) Silhouette... Silhouette (done) Metricas... Metricas (done) Wall time: 12 s
# Persist the DBSCAN report; context manager closes the handle explicitly.
with open("bow_dbscan_es_report.pickle", "wb") as f:
    pickle.dump(dbscan_es_report, f)
plot_confussion_matrix(labels_es, dbscan_es_report["clusters"])
%%time
dbscan_es_dm = distance_matrix(X_es, dbscan_es_report["clusters"], size=0.1)
Indices... Indices (done) Matriz de distancias... Matriz de distancias (done) Wall time: 73.1 ms
# Persist the distance matrix with an explicitly-closed handle, then plot it.
with open("bow_dbscan_es_dm.pickle", "wb") as f:
    pickle.dump(dbscan_es_dm, f)
plot_distance_matrix(dbscan_es_dm, q1=0.05, q3=0.69, x0=-70, scale=1.02, num_tags=2)
# Re-color the UMAP / t-SNE / PCA projections with the DBSCAN clusters.
df_umap_es = get_projection_df(umap_es_projection, dbscan_es_report["clusters"], labels_es)
df_tsne_es = get_projection_df(tsne_es_projection, dbscan_es_report["clusters"], labels_es)
df_pca_es = get_projection_df(pca_es_projection, dbscan_es_report["clusters"], labels_es)
# Plot order matches the original: per projection, cluster coloring then label.
for df_proj in (df_umap_es, df_tsne_es, df_pca_es):
    sample = df_proj.sample(1000, random_state=0)
    for color_col in ("cluster", "label"):
        fig = px.scatter(sample, x=0, y=1, color=color_col)
        fig.show(renderer="notebook")
%%time
# Gaussian mixture with one component per gold label; fixed seed.
gm_params = {"n_components": n_labels_es, "random_state": 0}
gm_es_report = clustering_report(X_trans, labels_es, GaussianMixture, gm_params)
Fit... Fit (done) Predict... Predict (done) Silhouette... Silhouette (done) Metricas... Metricas (done) Wall time: 2min 35s
# Persist the Gaussian-mixture report; context manager closes the handle explicitly.
with open("bow_gm_es_report.pickle", "wb") as f:
    pickle.dump(gm_es_report, f)
plot_confussion_matrix(labels_es, gm_es_report["clusters"])
%%time
gm_es_dm = distance_matrix(X_es, gm_es_report["clusters"], size=0.1)
Indices... Indices (done) Matriz de distancias... Matriz de distancias (done) Wall time: 58.1 ms
# Persist the distance matrix with an explicitly-closed handle, then plot it.
with open("bow_gm_es_dm.pickle", "wb") as f:
    pickle.dump(gm_es_dm, f)
plot_distance_matrix(gm_es_dm, q1=0.05, q3=0.69, x0=-70, scale=1.02, num_tags=5)
# Re-color the UMAP / t-SNE / PCA projections with the Gaussian-mixture clusters.
df_umap_es = get_projection_df(umap_es_projection, gm_es_report["clusters"], labels_es)
df_tsne_es = get_projection_df(tsne_es_projection, gm_es_report["clusters"], labels_es)
df_pca_es = get_projection_df(pca_es_projection, gm_es_report["clusters"], labels_es)
# Plot order matches the original: per projection, cluster coloring then label.
for df_proj in (df_umap_es, df_tsne_es, df_pca_es):
    sample = df_proj.sample(1000, random_state=0)
    for color_col in ("cluster", "label"):
        fig = px.scatter(sample, x=0, y=1, color=color_col)
        fig.show(renderer="notebook")